library(tidyverse)
library(rvest)

# Wikipedia's logistic-regression article carries the classic
# "hours studied vs exam passed" example table — scrape it.
lr_data_url <- "https://en.wikipedia.org/wiki/Logistic_regression"

# Fetch the page once, then pull every .wikitable, parse each into a
# data frame, and keep the first one (the study-hours table).
lr_page <- read_html(x = lr_data_url)

lr_data_raw <- lr_page %>%
  html_elements(".wikitable") %>%
  html_table() %>%
  .[[1]]
# Tidy the scraped table. The wikitable is laid out wide (one row per
# variable), so melt it long, tag each value as study hours or the
# admission outcome, then pivot back to one column per variable.
lr_long <- lr_data_raw %>%
  janitor::clean_names() %>%
  pivot_longer(-x1) %>%
  mutate(구분 = ifelse(str_detect(x1, "Hours"), "학습시간", "입학여부"))

lr_tbl <- lr_long %>%
  select(name, 구분, value) %>%
  pivot_wider(names_from = 구분, values_from = value) %>%
  select(학습시간, 입학여부)
# fs::dir_create("data")

# Cache the cleaned table locally so later runs can skip the web request.
lr_tbl %>%
  write_rds("data/lr_tbl.rds")

lr_tbl <- read_rds("data/lr_tbl.rds")

# Interactive preview of the 20 observations.
lr_tbl %>%
  reactable::reactable()

# Numeric summary of both columns (skimr output follows).
lr_tbl %>%
skimr::skim()

| Name | Piped data |
| Number of rows | 20 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| 학습시간 | 0 | 1 | 2.79 | 1.51 | 0.5 | 1.69 | 2.62 | 4.06 | 5.5 | ▇▇▆▅▅ |
| 입학여부 | 0 | 1 | 0.50 | 0.51 | 0.0 | 0.00 | 0.50 | 1.00 | 1.0 | ▇▁▁▁▇ |
# Scatter the 0/1 admission outcomes against study hours and overlay
# the logistic curve (geom_smooth fits a binomial glm internally).
lr_tbl %>%
  ggplot(aes(x = 학습시간, y = 입학여부)) +
  geom_point() +
  geom_smooth(method = "glm",
              method.args = list(family = "binomial"),
              se = FALSE) +
  labs(title = "학습시간과 입학확률",
       x = "학습시간",
       y = "합격확률 (%)") +
  theme_light() +
  scale_y_continuous(labels = scales::percent)

# Fit the logistic regression model: P(admission) ~ study hours.
adm_lr <- glm(입학여부 ~ 학습시간, family = "binomial", data = lr_tbl)
adm_lr
Call: glm(formula = 입학여부 ~ 학습시간, family = "binomial",
data = lr_tbl)
Coefficients:
(Intercept) 학습시간
-4.078 1.505
Degrees of Freedom: 19 Total (i.e. Null); 18 Residual
Null Deviance: 27.73
Residual Deviance: 16.06 AIC: 20.06
library(crosstalk)

# Prediction grid over the observed range of study hours (1h to 5h in
# 0.1h steps).
crosstalk_lr_raw <- tibble(학습시간 = seq(from = 1, to = 5, by = 0.1))

# Attach the model's predicted admission probability to the grid, then
# join the observed outcomes back on.  The join key is made explicit —
# relying on the implicit "join by all common columns" is fragile and
# prints a message on every run.
crosstalk_lr_tbl <- crosstalk_lr_raw %>%
  mutate(합격확률 = predict(adm_lr, newdata = crosstalk_lr_raw, type = "response")) %>%
  left_join(lr_tbl, by = "학습시간") %>%
  mutate(입학여부 = factor(입학여부, levels = c(0, 1), labels = c("불합격", "합격")))
# Overlay the fitted admission-probability curve on the observed
# pass/fail outcomes; rendered interactively with plotly below.
crosstalk_lr_g <- crosstalk_lr_tbl %>%
ggplot(aes(x = 학습시간, y = 합격확률) ) +
# Predicted probabilities along the grid.
geom_point() +
# Observed outcomes: the 불합격/합격 factor mapped back to 0/1 on the y axis.
geom_point(aes(x = 학습시간, y = as.numeric(입학여부) - 1, color = 입학여부 ) ) +
# NOTE(review): this refits a binomial glm on the predicted probabilities
# (continuous y, not 0/1), which ggplot will warn about — kept as written
# since it only affects the drawn curve.
geom_smooth(method = "glm", method.args = list(family = "binomial"),
se = FALSE) +
labs(title = "학습시간과 입학확률",
x = "학습시간",
y = "합격확률 (%)") +
theme_light() +
scale_y_continuous(labels = scales::percent)
plotly::ggplotly(crosstalk_lr_g )

최우추정량(MLE)을 찾는 것은 음의 로그 우도(negative log likelihood) 값을 최소화하는 것과 동일하기 때문에, General-purpose optimization에 목적함수를 정의해서 모수 초기값과 함께 넣어 반복적으로 근사시켜 모수를 계산한다.
\[ NLL(y) = -{\log(p(y))} \]
\[ \min_{\theta} \sum_y {-\log(p(y;\theta))} \]
\[ \max_{\theta} \prod_y p(y;\theta) \]
# Logistic (sigmoid) link: maps any real number into (0, 1).
# Written as 1 / (1 + exp(-x)), which stays numerically stable for
# large positive x (exp(x) / (1 + exp(x)) would overflow to NaN).
sigmoid_fn <- function(x) {
  1 / (1 + exp(-x))
}
# Negative log likelihood of a logistic regression model.
#
# par:           numeric coefficient vector. When include_alpha is TRUE,
#                par[1] is the intercept and the remaining entries are
#                slopes; otherwise every entry is a slope.
# data:          data frame holding the predictor columns and the 0/1
#                response column.
# y:             name (string) of the response column in `data`.
# include_alpha: include an intercept term?  Default TRUE.
#
# Returns the scalar -sum(log L), suitable for minimization by optimx().
neg_log_likelihood <- function(par, data, y, include_alpha = TRUE) {
  # Split predictors from the response. drop = FALSE keeps `x` a data
  # frame even with a single predictor (a base data.frame would collapse
  # it to a vector and break the column-wise mapply below); [[ extracts
  # the response as a plain vector whether `data` is a tibble or not.
  x <- data[, names(data) != y, drop = FALSE]
  y_data <- data[[y]]

  # 1. Linear predictor.
  if (include_alpha) {
    # beta_1 * x_1 + beta_2 * x_2 + ...
    linear_combination <- mapply("*", x, par[2:length(par)])
    # alpha + beta_1 * x_1 + beta_2 * x_2 + ...
    theta <- rowSums(linear_combination) + par[1]
  } else {
    theta <- rowSums(mapply("*", x, par))
  }

  # 2. Success probability via the logistic link.
  p <- sigmoid_fn(theta)

  # 3. Negative log likelihood of the Bernoulli outcomes.
  -sum(y_data * log(p) + (1 - y_data) * log(1 - p))
}
library(optimx)

# Fit by direct minimization of the negative log likelihood with
# Nelder-Mead, starting from (alpha, beta) = (0, 0).  trace = TRUE
# prints the iteration log shown below.
lr_opt <- optimx(
  par = c(0, 0),
  fn = neg_log_likelihood,
  data = lr_tbl,
  y = "입학여부",
  method = "Nelder-Mead",
  include_alpha = TRUE,
  itnmax = 100,
  control = list(trace = TRUE, all.methods = FALSE)
)

fn is fn
Looking for method = Nelder-Mead
Function has 2 arguments
Analytic gradient not made available.
Analytic Hessian not made available.
Scale check -- log parameter ratio= -Inf log bounds ratio= NA
Method: Nelder-Mead
Nelder-Mead direct search function minimizer
function value for initial parameters = 13.862944
Scaled convergence tolerance is 2.06574e-07
Stepsize computed as 0.100000
BUILD 3 13.887933 13.096825
EXTENSION 5 13.862944 12.582216
EXTENSION 7 13.096825 12.067907
EXTENSION 9 12.582216 11.285614
LO-REDUCTION 11 12.067907 11.285614
LO-REDUCTION 13 11.874408 11.285614
EXTENSION 15 11.469089 10.348151
LO-REDUCTION 17 11.285614 10.348151
EXTENSION 19 10.370274 8.914547
LO-REDUCTION 21 10.348151 8.914547
EXTENSION 23 9.266657 8.226456
REFLECTION 25 8.914547 8.030679
LO-REDUCTION 27 8.259889 8.030679
LO-REDUCTION 29 8.226456 8.030679
LO-REDUCTION 31 8.114076 8.030679
HI-REDUCTION 33 8.053435 8.030679
HI-REDUCTION 35 8.052924 8.030679
LO-REDUCTION 37 8.038012 8.030679
HI-REDUCTION 39 8.033349 8.030679
LO-REDUCTION 41 8.031439 8.030144
HI-REDUCTION 43 8.030679 8.030087
HI-REDUCTION 45 8.030144 8.029950
HI-REDUCTION 47 8.030087 8.029938
HI-REDUCTION 49 8.029950 8.029917
HI-REDUCTION 51 8.029938 8.029881
LO-REDUCTION 53 8.029917 8.029881
HI-REDUCTION 55 8.029890 8.029881
LO-REDUCTION 57 8.029889 8.029880
HI-REDUCTION 59 8.029881 8.029880
HI-REDUCTION 61 8.029880 8.029879
HI-REDUCTION 63 8.029880 8.029879
REFLECTION 65 8.029879 8.029879
Exiting from Nelder Mead minimizer
67 function evaluations used
Post processing for method Nelder-Mead
Successful convergence!
Compute Hessian approximation at finish of Nelder-Mead
Compute gradient approximation at finish of Nelder-Mead
Save results from method Nelder-Mead
$par
[1] -4.076953 1.504453
$value
[1] 8.029879
$message
NULL
$convcode
[1] 0
$fevals
function
67
$gevals
gradient
NA
$nitns
[1] NA
$kkt1
[1] TRUE
$kkt2
[1] TRUE
$xtimes
user.self
0.08
Assemble the answers
# Pull the two fitted parameters (p1 = intercept, p2 = slope) out of
# the optimx result for the Nelder-Mead run.
lr_opt[, 1:5] %>%
  as.data.frame() %>%
  rownames_to_column("method") %>%
  filter(method == "Nelder-Mead") %>%
  select(method, p1, p2)

method p1 p2
1 Nelder-Mead -4.076953 1.504453
glm() 함수로 구현한 것과 값이 동일한지 상호확인한다.
adm_lr$coefficients

(Intercept)    학습시간
-4.077713 1.504645
# Predict P(y = 1) from a data frame of predictors and a coefficient set.
#
# x:   data frame of predictor columns (one row per observation).
# par: coefficients. NOTE(review): in this document it is passed an
#      optimx result slice (lr_opt[, 1:2], a 1-row data frame) — assumed
#      to behave like a vector under `[` and `length()`; confirm before
#      reusing with other inputs.
# If par has one more entry than x has columns, par[1] is the intercept.
predict_fn <- function(x, par){
if( ncol(x) < length(par) ){
# Intercept plus beta_i * x_i for each predictor column.
theta <- rowSums(mapply("*", x, par[2:length(par)])) + as.numeric(par[1])
} else {
# No intercept: pure linear combination of the predictors.
theta <- rowSums(mapply("*", x, par))
}
# Map the linear predictor through the logistic link to a probability.
prob <- sigmoid_fn(theta)
return(prob)
}
# Predictions from the hand-rolled optimizer coefficients.
custom_pred <- predict_fn(lr_tbl %>% select(학습시간), lr_opt[, 1:2])

# Put the hand-rolled and glm() predictions side by side for comparison.
lr_tbl %>%
  mutate(
    자체제작_예측 = custom_pred,
    예측 = predict(adm_lr, newdata = lr_tbl %>% select(학습시간), type = "response")
  ) %>%
reactable::reactable()

데이터 과학자 이광춘 저작
kwangchun.lee.7@gmail.com